Contents

%run set_theme.ipynb
import plotly.graph_objs as go
import plotly.express as px
import pandas as pd
from plotly.offline import init_notebook_mode

# Switch plotly's offline mode on for this notebook before any figure is shown.
init_notebook_mode()

# Survey responses (presumably Stack Overflow 2014-2022, judging by the file
# name -- TODO confirm); the columns used below are YearsCodePro, Gender, Salary.
df = pd.read_parquet('../data/SO_2014_2022.pq')

# Mean salary for every (years of professional coding, gender) pair, flattened
# back into ordinary columns so it can be filtered per gender later on.
avg_salary = (
    df.groupby(['YearsCodePro', 'Gender'])['Salary']
    .mean()
    .reset_index()
)
avg_salary
YearsCodePro Gender Salary
0 0 female 39553.009805
1 0 male 32276.145762
2 1 female 39444.444886
3 1 male 32361.590644
4 2 female 41009.217794
... ... ... ...
95 48 male 114928.0
96 49 female 8.0
97 49 male 117607.3
98 50 female 133300.0
99 50 male 84511.262295

100 rows × 3 columns

def _smoothed_relative_salary(yearly_avg, start_salary, window=3):
    """Return the centred rolling mean of salary growth relative to a baseline.

    Args:
        yearly_avg: DataFrame with 'YearsCodePro' and 'Salary' columns holding
            one row per experience year for a single gender.
        start_salary: Baseline salary; every yearly mean is expressed as
            ``salary / start_salary - 1`` (0.5 means 50% above the baseline).
        window: Number of consecutive rows averaged together.

    Returns:
        DataFrame with 'YearsCodePro' and 'Salary' columns, where 'Salary' is
        the smoothed relative increase rounded to three decimals. Rows at the
        edges that do not have a full window are dropped.
    """
    ordered = yearly_avg.sort_values('YearsCodePro')
    growth = ordered['Salary'].astype(float).div(start_salary).sub(1)
    # center=True labels each window with its middle row, so the result keeps
    # the real YearsCodePro value even when some years are missing for a
    # gender (a positional index would silently shift all later points).
    smoothed = growth.rolling(window, center=True).mean().round(3)
    return (
        pd.DataFrame({'YearsCodePro': ordered['YearsCodePro'], 'Salary': smoothed})
        .dropna(subset=['Salary'])
    )


# Collect one frame per gender and concatenate once at the end; starting from
# an empty DataFrame and concatenating onto it relies on deprecated pandas
# behaviour (FutureWarning about empty entries).
gender_frames = []

for gender in ['male', 'female']:
    # Boolean masks instead of DataFrame.query(): same filter, but without the
    # numexpr engine-switch RuntimeWarning on extension dtypes.
    is_gender = df['Gender'] == gender
    is_starter = df['YearsCodePro'] < 2
    # Baseline: mean salary of respondents with less than two years of
    # professional experience.
    start_salary = df.loc[is_gender & is_starter, 'Salary'].mean()

    yearly_avg = avg_salary.loc[avg_salary['Gender'] == gender, ['YearsCodePro', 'Salary']]
    gender_frame = _smoothed_relative_salary(yearly_avg, start_salary)
    gender_frame.insert(1, 'Gender', gender)
    gender_frames.append(gender_frame)

# Final layout: YearsCodePro, Gender, Salary with a fresh 0..n-1 index.
smoothed_avg = pd.concat(gender_frames, ignore_index=True)
smoothed_avg
C:\Users\Efe\AppData\Local\Temp\ipykernel_4340\510673352.py:4: RuntimeWarning:

Engine has switched to 'python' because numexpr does not support extension array dtypes. Please set your engine to python manually.

C:\Users\Efe\AppData\Local\Temp\ipykernel_4340\510673352.py:11: FutureWarning:

The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.
C:\Users\Efe\AppData\Local\Temp\ipykernel_4340\510673352.py:4: RuntimeWarning:

Engine has switched to 'python' because numexpr does not support extension array dtypes. Please set your engine to python manually.
YearsCodePro Gender Salary
0 1 male 0.029
1 2 male 0.164
2 3 male 0.302
3 4 male 0.530
4 5 male 0.712
... ... ... ...
91 43 female 1.304
92 44 female 1.970
93 45 female 1.407
94 46 female 0.605
95 47 female 0.801

96 rows × 3 columns

# One line per gender showing the smoothed salary increase relative to the
# starting salary, plotted against years of professional coding experience.
gender_colors = {'male': '#5b6fec', 'female': '#f854ee'}

fig = px.line(
    smoothed_avg,
    x="YearsCodePro",
    y='Salary',
    color='Gender',
    color_discrete_map=gender_colors,
    range_x=[1, 40],
    range_y=[0, 3],
)

# Horizontal, untitled legend placed just above the plotting area.
legend_style = {
    'title': '',
    'orientation': 'h',
    'xanchor': 'center',
    'yanchor': 'top',
    'x': 0.47,
    'y': 1.085,
    'itemwidth': 45
}

# Titles, percent-formatted y axis, hover behaviour and overall sizing; the
# enlarged bottom margin leaves room for the caption added below.
fig.update_layout(
    xaxis_title="Years coded professionally",
    yaxis_title="Average salary increase",
    title='Coding Experience vs Relative Salary Increase<br><sup>After five years men receive higher salary increase for the same work experience</sup>',
    yaxis_tickformat='.0%',
    hovermode="x",
    hoverlabel={'font_color': 'white', 'bordercolor': 'white'},
    legend=legend_style,
    margin={'t': 100, 'r': 50, 'b': 130, 'l': 110},
    width=790,
)

# Add caption (positioned in paper coordinates, below the x axis)
caption_text = (
    'This shows the years of professional coding experience compared to salary increase relative to starting salary.<br>'
    'Hover over the graph from left to right to compare the increase for both genders.'
)
fig.add_annotation(
    x=-0.12,
    y=-0.35,
    showarrow=False,
    xref='paper',
    yref='paper',
    xanchor='left',
    yanchor='bottom',
    align='left',
    text=caption_text,
)

# Same hover text for every trace; <extra></extra> hides the secondary box.
fig.update_traces(hovertemplate='<br>Salary increase: %{y}<extra></extra>')

fig.show()